Round 1: Top-level cell type annotation

Can we distinguish cell types by surface proteins?

.hdr <- "result/step1/matrix_final"
.data <- fileset.list(.hdr)
.file <- "result/step2/prot_bbknn.rds"
if.needed(.file, {
    
    .batches <- .fread(.prot.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_pca(.prot.data$mtx,
                                    .batches$batch,
                                    knn = 10, RANK = 20,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
  • nTconv : CD3+, CD4+, CD8-, CD25-/CD127+, CD45RA+/CD45RO-
  • mTconv : CD3+, CD4+, CD8-, CD25-/CD127+, CD45RA-/CD45RO+
  • nTreg : CD3+, CD4+, CD8-, CD25+/CD127-, CD45RA+/CD45RO-
  • mTreg : CD3+, CD4+, CD8-, CD25+/CD127-, CD45RA-/CD45RO+

Run annotation purely based on marker genes:

.file <- "result/step2/prot_annot.txt.gz"
if.needed(.file, {
    
    .pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
    .neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")

    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            row_file = .prot.data$row,
            col_file = .prot.data$col,
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            EM_TOL = 1e-6,
            EM_ITER = 500)
    annot.dt <- setDT(.annot.out$annotation)
    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(annot.dt) <- .col
    annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
    fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)

Two-dimensional density plot on the normalized CD marker expressions.

U <- .bbknn$U
D <- .bbknn$D
V <- .bbknn$factors.adjusted

prot.mtx <- pmax(exp(sweep(U, 2, D, `*`) %*% t(V)) - 1, 0)
prot.mtx <- apply(prot.mtx, 2, function(x) x/pmax(sum(x),1) * 1e4)

rownames(prot.mtx) <- readLines(.prot.data$row)
colnames(prot.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker_bbknn.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

Two-dimensional density plot on the raw CD marker concentrations.

prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

Will the same classification results hold in transcriptomic data?

.file <- "result/step2/gene_bbknn.rds"

if.needed(.file, {

    .batches <- .fread(.gene.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_pca(.gene.data$mtx,
                                    .batches$batch,
                                    knn = 10, RANK = 30,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE,
                                    NUM_THREADS = 8)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
run.bbknn.umap <- function(.bbknn, .cells, ...){

    .df <- data.frame(from = .bbknn$knn$src.index,
                      to = .bbknn$knn$tgt.index,
                      weight = .bbknn$knn$weight)

    G <- igraph::graph_from_data_frame(.df, directed=FALSE)

    nrepeat <- 20
    C <- list(modularity = 0)
    for(r in 1:nrepeat){
        c.r <- igraph::cluster_louvain(G, resolution = .75)
        message(paste("modularity: ", c.r$modularity, "\n"))
        if(r == 1 || max(c.r$modularity) > max(C$modularity)){
            C <- c.r
        }
    }

    .clust <- data.table(tag = .cells[as.integer(C$names)],
                         membership = C$membership)

    A <- igraph::as_adj(G)

    umap.A <- uwot::optimize_graph_layout(A, verbose = TRUE, n_sgd_threads = "auto", ...)

    .dt <- data.table(tag = .cells[as.integer(rownames(A))],
                      umap1 = umap.A[,1],
                      umap2 = umap.A[,2]) %>% 
        left_join(.clust, by = "tag")
}
  1. Build a batch-balancing kNN graph

  2. Louvain graph-based clustering

  3. Construct UMAP the kNN backbone graph

.file <- "result/step2/gene_bbknn_louvain.txt.gz"
.cells <- readLines(.gene.data$col)
if.needed(.file, {
    .bbknn.umap <- run.bbknn.umap(.bbknn, .cells, spread=3)
    fwrite(.bbknn.umap, .file)
})
.bbknn.umap <- fread(.file) %>%
    left_join(annot.dt, by = "tag") %>% 
    mutate(membership = as.factor(membership))

Clustering patterns in scRNA-seq agrees with the classification results based on surface proteins

p1 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
    geom_point(stroke=0, alpha=.8, size=.5)
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300)

geom_point: na.rm = FALSE stat_identity: na.rm = FALSE position_identity

p2 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_gene_only_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)
plt <- 
    .gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
    xlab("Louvain clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22) +
    scale_size_continuous("#cells", range=c(0,5)) +
    scale_fill_distiller("proportion",
                         palette="PuRd", direction = 1,
                         labels=function(x) round(sigmoid(x), 2))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_gene_only_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

Joint clustering analysis

.file <- "result/step2/full_bbknn.rds"

if.needed(.file, {

    .batches <- .fread(.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_pca(.data$mtx,
                                    .batches$batch,
                                    knn = 10, RANK = 30,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE,
                                    NUM_THREADS = 8)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)

1. Run annotation on the joint BBKNN data

.file <- "result/step2/prot_annot_full.txt.gz"
if.needed(.file, {
    
    .pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
    .neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")

    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            row_file = .data$row,
            col_file = .data$col,
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            EM_TOL = 1e-6,
            EM_ITER = 500)
    annot.dt <- setDT(.annot.out$annotation)
    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(annot.dt) <- .col
    annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
    fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)

Two-dimensional density plot on the raw CD marker concentrations.

prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker_full.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

2. Graph-based clustering

.file <- "result/step2/full_bbknn_louvain.txt.gz"
.cells <- readLines(.data$col)
if.needed(.file, {
    .bbknn.umap <- run.bbknn.umap(.bbknn, .cells, spread=3)
    fwrite(.bbknn.umap, .file)
})
.bbknn.umap <- fread(.file) %>%
    left_join(annot.dt, by = "tag") %>% 
    mutate(membership = as.factor(membership))
p1 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300)

p2 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_full_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)

plt <-
    .gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
    xlab("Louvain clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22) +
    scale_size_continuous("#cells", range=c(0,5)) +
    scale_fill_distiller("proportion",
                         palette="PuRd", direction = 1,
                         labels=function(x) round(sigmoid(x), 2))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_full_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

3. Refine cell type assignments with the clustering results

.valid.clust <- .ct.mem[order(pr, decreasing = T),
                        head(.SD, 1),
                        by = .(membership)]

.valid.celltype.cluster <-
    .valid.clust[pr > .5, .(membership, celltype)] %>%
    unique()

celltype.final <- 
    .bbknn.umap %>%
    rename(celltype.protein = celltype) %>%
    (function(x) left_join(.valid.celltype.cluster, x)) %>%
    na.omit()

DOWNLOAD: Cell type estimation

p1 <-
    .gg.plot(celltype.final, aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300)

p2 <-
    .gg.plot(celltype.final, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_final_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

4. Confirm by other marker genes/proteins

.markers <-
    c("FOXP3", "ID3", "BACH2", "CXCR3", "PRDM1", "SGK1", "TCF7", "LEF1",
      "SELL", "IL2RA", "IL7R", "IKZF2", "CCR6", "CCR4", "CCR7", "CTLA4",
      "HLA-DRA", "anti_CD25", "anti_CD127", "anti_CD183", "anti_CD196",
      "anti_CD197", "anti_CD194", "anti_CD45RA", "anti_CD45RO",
      "anti_HLA") %>%
    unique
.marker.hdr <- "result/step2/marker/marker"
.mkdir(dirname(.marker.hdr))
.marker.data <- fileset.list(.marker.hdr)
if.needed(.marker.data, {
    .marker.data <-
        rcpp_mmutil_copy_selected_rows(.data$mtx,
                                       .data$row,
                                       .data$col,
                                       .markers,
                                       .marker.hdr)
})

Note: these plots show expression values without batch adjustment

.markers <- sort(as.character(unique(marker.dt$marker)))
for(g in .markers){
    plt <- plot.marker.umap(g)
    print(plt)
    .file <- fig.dir %&% "/Fig_marker_" %&% g %&% "_umap.pdf"
    .gg.save(filename = .file, plot = plt, width=3.2, height=2.8)
}

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

Basic statistics for the first round annotation (23,341 cells)

.hash.info <- fread("data/cell_info_batch3-7.csv.gz") %>%
    rename(tag = cbid, subject = id) %>%
    select(-`V1`) %>%
    mutate(disease = substr(`subject`, 1, 2)) %>% 
    mutate(hash = gsub(pattern="[a-zA-Z\\-]+", replacement="", `hash`))

.sample.info <-
    readxl::read_xlsx("data/Hashing list MS Treg project.xlsx", 1) %>% 
    rename(Sample = `Cell type`) %>%
    mutate(hash = gsub("#","",`hash`))

.dt <-
    celltype.final %>%
    mutate(batch=as.integer(batch)) %>%
    left_join(.hash.info) %>%
    mutate(ct=factor(celltype, c("nTconv","mTconv","nTreg","mTreg"))) %>% 
    left_join(.sample.info)

Combining all the experiments

PDF

PDF

Total CD4 samples

PDF

PDF

CD25 enriched samples

PDF

PDF